This Markdown file contains the report exercise for Chapter 9 (Supervised Machine Learning I) of the AGDS course. The exercise centers on k-nearest-neighbors (KNN) models: how they compare to linear regression when predicting GPP after splitting the data into training and test datasets, and what role the number of neighbors k plays — i.e., how tuning k influences the model. This is explored through visualization and through metrics such as RMSE and MAE, which quantify model goodness of fit and reveal the bias–variance trade-off.
# Load the daily FLUXNET data for CH-Dav and prepare it for modelling:
# keep only the needed variables, parse dates, recode missing values, and
# mask out low-quality measurements.
daily_fluxes <- readr::read_csv("./data/FLX_CH-Dav_FLUXNET2015_FULLSET_DD_1997-2014_1-3.csv") |>
  # select only the variables we are interested in
  dplyr::select(TIMESTAMP,
                GPP_NT_VUT_REF,   # the target
                ends_with("_QC"), # quality control info
                ends_with("_F"),  # includes all meteorological covariates
                -contains("JSB")  # weird useless variable
                ) |>
  # convert to a nice date object
  # (namespaced with lubridate:: for consistency with the other calls)
  dplyr::mutate(TIMESTAMP = lubridate::ymd(TIMESTAMP)) |>
  # set all -9999 (the FLUXNET missing-value code) to NA
  dplyr::mutate(across(where(is.numeric), ~na_if(., -9999))) |>
  # retain only data based on >=80% good-quality measurements;
  # overwrite bad data with NA (not dropping rows)
  dplyr::mutate(GPP_NT_VUT_REF = ifelse(NEE_VUT_REF_QC < 0.8, NA, GPP_NT_VUT_REF),
                TA_F = ifelse(TA_F_QC < 0.8, NA, TA_F),
                SW_IN_F = ifelse(SW_IN_F_QC < 0.8, NA, SW_IN_F),
                LW_IN_F = ifelse(LW_IN_F_QC < 0.8, NA, LW_IN_F),
                VPD_F = ifelse(VPD_F_QC < 0.8, NA, VPD_F),
                PA_F = ifelse(PA_F_QC < 0.8, NA, PA_F),
                P_F = ifelse(P_F_QC < 0.8, NA, P_F),
                WS_F = ifelse(WS_F_QC < 0.8, NA, WS_F)) |>
  # drop QC variables (no longer needed)
  dplyr::select(-ends_with("_QC"))
The code above was copied from the chapter (Ctrl+C, Ctrl+V) and cleaned up to address a few issues.
# Data splitting
set.seed(1982)
# LW_IN_F has a high share of NA values and, according to the script, is a
# priori not critical for GPP predictions, so we drop the variable before
# splitting. The explicit != comparison replaces the previous
# !(colnames(...)) == "LW_IN_F", which only worked because `==` has higher
# precedence than `!`.
daily_fluxes <- daily_fluxes[colnames(daily_fluxes) != "LW_IN_F"]
# 70% training / 30% test, stratified by VPD_F so both sets cover its range.
split <- rsample::initial_split(daily_fluxes, prop = 0.7, strata = "VPD_F")
daily_fluxes_train <- rsample::training(split)
daily_fluxes_test <- rsample::testing(split)
# Model and pre-processing formulation: predict GPP from shortwave radiation,
# vapour pressure deficit, and air temperature (all variables but LW_IN_F).
# Steps: Box-Cox transform the predictors, then centre and scale all numeric
# columns except the outcome.
pp <- recipes::recipe(GPP_NT_VUT_REF ~ SW_IN_F + VPD_F + TA_F,
                      data = drop_na(daily_fluxes_train))
pp <- recipes::step_BoxCox(pp, all_predictors())
pp <- recipes::step_center(pp, all_numeric(), -all_outcomes())
pp <- recipes::step_scale(pp, all_numeric(), -all_outcomes())
# Fit the linear regression model on the complete-case training data
# (no resampling: trControl method "none").
lm_train_data <- drop_na(daily_fluxes_train)
mod_lm <- caret::train(
  pp,
  data = lm_train_data,
  metric = "RMSE",
  method = "lm",
  trControl = caret::trainControl(method = "none")
)
# Print coefficient estimates and fit statistics.
summary(mod_lm)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1728 -1.0709 -0.1613 0.9008 7.3409
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.21581 0.02446 131.47 <2e-16 ***
## SW_IN_F 1.16519 0.03297 35.34 <2e-16 ***
## VPD_F -0.81775 0.03631 -22.52 <2e-16 ***
## TA_F 1.93384 0.03411 56.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.593 on 4238 degrees of freedom
## Multiple R-squared: 0.6645, Adjusted R-squared: 0.6642
## F-statistic: 2798 on 3 and 4238 DF, p-value: < 2.2e-16
# Fit the KNN model with a fixed k = 8 (no resampling-based tuning).
knn_train_data <- drop_na(daily_fluxes_train)
mod_knn <- caret::train(
  pp,
  data = knn_train_data,
  metric = "RMSE",
  method = "knn",
  tuneGrid = data.frame(k = 8),
  trControl = caret::trainControl(method = "none")
)
# Load the model-evaluation helper.
source("./eval_model.R")
# Evaluate the linear regression model on both the training and the test set.
eval_model(mod_lm, df_train = daily_fluxes_train, df_test = daily_fluxes_test)
Figure 1: Evaluation of the linear regression model fit vs. GPP measurements.
# Evaluate the KNN model with the same helper for a direct comparison.
eval_model(mod_knn, df_train = daily_fluxes_train, df_test = daily_fluxes_test)
Figure 2: Evaluation of the k-nearest-neighbors model fit vs. GPP measurements.
# eval_model() computes fitted values internally but does not return the
# modified data frames (function arguments are copies in R), so the
# predictions are recreated here and stored on the data frames for the
# later exercises.
# Training data: keep complete cases, then add both models' fitted values.
daily_fluxes_train <- drop_na(daily_fluxes_train)
daily_fluxes_train[["lm_fitted"]] <- predict(mod_lm, newdata = daily_fluxes_train)
daily_fluxes_train[["knn_fitted"]] <- predict(mod_knn, newdata = daily_fluxes_train)
# Test data: same procedure.
daily_fluxes_test <- drop_na(daily_fluxes_test)
daily_fluxes_test[["lm_fitted"]] <- predict(mod_lm, newdata = daily_fluxes_test)
daily_fluxes_test[["knn_fitted"]] <- predict(mod_knn, newdata = daily_fluxes_test)
# Combine the training and test predictions into one long data frame per
# model (TIMESTAMP + fitted value), in preparation for exercise 3.
# Naming the columns directly in data.frame() replaces the fragile
# post-hoc names<- assignment used before.
lm_fit_df <- data.frame(
  TIMESTAMP = c(daily_fluxes_train$TIMESTAMP, daily_fluxes_test$TIMESTAMP),
  fitted = c(daily_fluxes_train$lm_fitted, daily_fluxes_test$lm_fitted))
KNN_fit_df <- data.frame(
  TIMESTAMP = c(daily_fluxes_train$TIMESTAMP, daily_fluxes_test$TIMESTAMP),
  fitted = c(daily_fluxes_train$knn_fitted, daily_fluxes_test$knn_fitted))
The KNN model could be slightly over- or underfitting the training data: when the model tuned on the training data is applied to the test data, the fit worsens because of the slight "wiggle" in its fitted line. The linear regression, on the other hand, is a fully straight line — it does not fit as well in general, but differences between training and test data consequently have little influence on its RMSE.
Because the KNN model is more flexible than the linear regression, which is "just a straight line".
Linear regressions have a high bias: they are not affected by the noise in the observations, but their predictions are further from the observations (just like model poly1 in the chapter) because the model is oversimplified. KNN can be low bias and high variance when k is too small, or high bias and low variance when k is too large.
# Colour mapping shared by all layers so a single manual legend is produced.
colors1 <- c("GPP Measured" = "black", "Linear fit" = "green", "KNN fit" = "red")
# Observed GPP as points, the two model fits as semi-transparent lines.
ggplot() +
  geom_point(data = daily_fluxes,
             aes(x = TIMESTAMP, y = GPP_NT_VUT_REF, color = "GPP Measured"),
             alpha = 0.8) +
  geom_line(data = lm_fit_df,
            aes(x = TIMESTAMP, y = fitted, color = "Linear fit"),
            alpha = 0.5) +
  geom_line(data = KNN_fit_df,
            aes(x = TIMESTAMP, y = fitted, color = "KNN fit"),
            alpha = 0.5) +
  scale_color_manual(values = colors1) +
  labs(x = "Year", y = "GPP") +
  # Enlarged axis text and legend for readability (the two separate
  # theme() calls are merged into one; the result is identical).
  theme(axis.text = element_text(size = 20),
        axis.title = element_text(size = 30),
        legend.key.size = unit(2, "cm"),
        legend.text = element_text(size = 20),
        legend.title = element_blank())
Figure 3: Temporal variation of observed GPP (black) and GPP modelled with KNN (red) and linear regression (green) for the period 1997–2014.
# Observed GPP (red) vs. KNN fitted values (blue) over time — training set.
ggplot() +
  geom_point(data = daily_fluxes_train,
             aes(x = TIMESTAMP, y = GPP_NT_VUT_REF),
             color = 'red', alpha = 0.5) +
  # geom_line(data = lm_model[[2]], aes(x = TIMESTAMP, y = fitted), color = 'green') +
  geom_point(data = daily_fluxes_train,
             aes(x = TIMESTAMP, y = knn_fitted),
             color = 'blue', alpha = 0.5)
# The same comparison for the test set.
ggplot() +
  geom_point(data = daily_fluxes_test,
             aes(x = TIMESTAMP, y = GPP_NT_VUT_REF),
             color = 'red', alpha = 0.5) +
  geom_point(data = daily_fluxes_test,
             aes(x = TIMESTAMP, y = knn_fitted),
             color = 'blue', alpha = 0.5)
These plots compare the model fits on the training and test sets with the actual GPP measurements. With k = 1 it is clearly visible that GPP and fit coincide exactly in the training plot (except for the few points where GPP could not be predicted due to NA values), but on the test dataset the fit is all over the place — a sign of overfitting to the training data and thus poor generalization.
k = 1 means the fitted line runs through every training observation, giving an R² of 1.00 on the training data, but it would very likely have a poor R² and mean error when applied to the test dataset (overfitting, high variance).
k = N means an overly generalized model, which should perform somewhat poorly in both R² and MAE, with training and test values close to each other (underfitting, high bias).
# Fit one KNN model per candidate k, evaluate it, and collect two plots per k.
plots <- list()      # eval_model() output, annotated with the k used
moreplots <- list()  # observed vs. fitted time-series plot
different_ks <- c(1, 2, 5, 8, 15, 50, 150, 300)
knn_diff_k <- list() # the fitted caret models
all_models <- list() # the eval_model() results
# seq_along() replaces 1:length(), which misbehaves for empty vectors.
for (i in seq_along(different_ks)) {
  knn_diff_k[[i]] <- caret::train(
    pp,
    data = daily_fluxes_train |> drop_na(),
    method = "knn",
    trControl = caret::trainControl(method = "none"),
    tuneGrid = data.frame(k = different_ks[i]),
    metric = "RMSE")
  all_models[[i]] <- eval_model(knn_diff_k[[i]], daily_fluxes_train, daily_fluxes_test)
  # Overwrite the fitted column so the time-series plot below shows this k.
  daily_fluxes_train$knn_fitted <- predict(knn_diff_k[[i]], newdata = daily_fluxes_train)
  # Annotate the evaluation panel with the k used.
  plots[[i]] <- ggdraw(all_models[[i]]) +
    draw_label(paste("k =", different_ks[i]), x = 0.45, y = 0.96, color = 'red')
  moreplots[[i]] <- ggplot() +
    geom_point(data = daily_fluxes_train,
               aes(x = TIMESTAMP, y = GPP_NT_VUT_REF), color = 'red', alpha = 0.5) +
    geom_point(data = daily_fluxes_train,
               aes(x = TIMESTAMP, y = knn_fitted), color = 'blue', alpha = 0.5) +
    labs(x = "year", y = "GPP")
}
# Helper that arranges a list of plots for display.
source("./plot_helper.R")
# Draw the evaluation panels (with k annotation) and the time-series plots.
plots_draw(plots)
plots_draw(moreplots)
Figure 4: GPP vs. model fit and evaluation for 8 different values of k.
#plots
# Printing the list renders each time-series plot in turn — a fallback used
# here because the cowplot-based grid drawing is not working (see note below).
moreplots
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
#NOTE: the cowplot-based grid drawing still does not work here; the cause is unclear, so the plots are printed individually above instead.
# Compute the test-set mean absolute error (MAE) of a KNN model with the
# given number of neighbours k.
#
# Relies on the globals `pp` (preprocessing recipe), `daily_fluxes_train`
# and `daily_fluxes_test`. The assignment to daily_fluxes_test$fitted only
# modifies the function-local copy, not the global data frame.
compute_MAE <- function(k){
  # Fit KNN with the requested k (no resampling).
  knn_diff_k <- caret::train(
    pp,
    data = daily_fluxes_train |> drop_na(),
    method = "knn",
    trControl = caret::trainControl(method = "none"),
    tuneGrid = data.frame(k = k),
    metric = "RMSE")
  # Predict on the held-out test data and compute the yardstick metrics.
  daily_fluxes_test$fitted <- predict(knn_diff_k, newdata = daily_fluxes_test)
  metrics_test <- daily_fluxes_test |>
    yardstick::metrics(GPP_NT_VUT_REF, fitted)
  # Extract the MAE estimate; dplyr calls are namespaced for consistency
  # with the rest of the script, and the redundant return() is dropped.
  metrics_test |>
    dplyr::filter(.metric == "mae") |>
    dplyr::pull(.estimate)
}
# Print the test-set MAE for each candidate k; iterating over the values
# directly replaces the fragile 1:length() index pattern.
for (k in different_ks) {
  print(compute_MAE(k))
}
## [1] 1.495793
## [1] 1.303756
## [1] 1.175692
## [1] 1.147647
## [1] 1.123898
## [1] 1.111188
## [1] 1.128479
## [1] 1.153778
Just like in the previous exercise, we use the mean absolute error (MAE) on the test dataset, computing it for every k between 1 and 100.
# Test-set MAE for every k from 1 to 100.
# Preallocating the result vector replaces growing it from c() in the loop.
all_MAE <- numeric(100)
for (i in seq_len(100)) {
  all_MAE[i] <- compute_MAE(i)
}
# Smallest test-set MAE achieved across k = 1..100.
min(all_MAE)
## [1] 1.104253
# The k that achieves the smallest test-set MAE.
which.min(all_MAE)
## [1] 32
# MAE as a function of k, with the best k highlighted in red.
# Bug fix: the literal 'red' was previously inside aes(), which maps it as a
# data value (rendering ggplot's default salmon colour and creating a bogus
# legend) instead of setting the actual point colour; it belongs outside aes().
ggplot(
  data = data.frame(all_MAE), aes(x = 1:100, y = all_MAE)) +
  geom_point() +
  geom_point(aes(x = which.min(all_MAE),
                 y = min(all_MAE)), color = 'red') +
  theme(legend.position = "none") +
  labs(y = "Mean absolute error", x = "k")
Using the MAE as the metric on the test dataset for various k, it seems that k = 32 is the optimal value for generalisability, as it yields the lowest mean absolute error on the test data. However, it is hard to draw a firm conclusion from this single, somewhat rigid data split without any resampling or additional metrics.